tarsec 0.2.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -48,7 +48,7 @@ parser("hello there"); // failure
48
48
  - [Pretty error messages](/tutorials/pretty-errors.md)
49
49
 
50
50
  ## Examples
51
- - [A markdown parser](/tests/examples/markdown.ts)
51
+ - [A CommonMark-ish markdown parser](/lib/parsers/markdown) — importable as `tarsec/parsers/markdown`. Supports headings (ATX 1–6 with optional trailing `#` stripping, plus setext), fenced and indented code blocks, multi-backtick inline code spans, multi-line / nested block quotes, ordered / unordered / nested lists, pipe tables with alignment, horizontal rules, HTML passthrough, VitePress-style YAML frontmatter, plus inline bold/italic (`*` and `_`), combined `***bold-italic***`, strikethrough, escapes, autolinks, hard *and* soft line breaks, images and links with optional `"title"`, and reference-style links / footnotes resolved in a post-parse pass. Paragraphs round-trip soft-wrapped lines through an `inline-soft-break` node. Inline emphasis, strike, and link content all nest, so ``**[link](u)**`` and ``*a `code` b*`` round-trip into the AST.
52
52
 
53
53
  Read more about [use cases for tarsec](/tutorials/use-case.md).
54
54
 
@@ -0,0 +1,14 @@
1
+ import { Parser } from "../../types.js";
2
+ import { Heading, CodeBlock, BlockQuote, Paragraph, HorizontalRule, List, Table, HTMLBlock } from "./types.js";
3
+ export { imageParser } from "./inline.js";
4
+ export declare const headingParser: Parser<Heading>;
5
+ export declare const codeBlockParser: Parser<CodeBlock>;
6
+ export declare const blockQuoteParser: Parser<BlockQuote>;
7
+ export declare const indentedCodeBlockParser: Parser<CodeBlock>;
8
+ export declare const setextHeadingParser: Parser<Heading>;
9
+ export declare const listParser: Parser<List>;
10
+ export declare const blankLine: Parser<unknown>;
11
+ export declare const htmlBlockParser: Parser<HTMLBlock>;
12
+ export declare const tableParser: Parser<Table>;
13
+ export declare const horizontalRuleParser: Parser<HorizontalRule>;
14
+ export declare const paragraphParser: Parser<Paragraph>;
@@ -0,0 +1,189 @@
1
+ import { seqC, seqR, capture, optional, or, manyTillStr, many1Till, exactly, many, many1, many1WithJoin, map, not, lazy, } from "../../combinators.js";
2
+ import { str, spaces, char, eof, set, alphanum, oneOf, noneOf, } from "../../parsers.js";
3
+ import { digit, letter } from "../../parsers.js";
4
+ import { manyTill } from "../../combinators.js";
5
+ import { inlineMarkdownParser, softBreakParser } from "./inline.js";
6
+ export { imageParser } from "./inline.js";
7
+ const languageChar = or(alphanum, oneOf("_+#.-"));
8
+ const languageTag = many1WithJoin(languageChar);
9
+ /* ATX heading marker: 1–6 consecutive `#`, not followed by another `#`.
10
+ * Try widest first so `###` doesn't parse as level 1 and leave `##` behind.
11
+ * `not(char("#"))` rejects 7+ `#` runs (they fall through to a paragraph). */
12
+ const atxMarker = or(...[6, 5, 4, 3, 2, 1].map((n) => map(seqR(exactly(n, char("#")), not(char("#"))), () => n)));
13
+ /* An optional trailing run of `#`s on an ATX heading: at least one separating
14
+ * space, one or more `#`, optional trailing spaces, then end-of-line. */
15
+ const trailingHashRun = seqR(many1(char(" ")), many1(char("#")), many(char(" ")), or(char("\n"), eof));
16
+ /* The heading body — everything up to (but not including) either the line end
17
+ * or a trailing `#` run. We capture this as a raw string then re-parse it as
18
+ * inline markdown so ATX and setext headings share the same content shape. */
19
+ const headingBody = many1Till(or(char("\n"), trailingHashRun));
20
/* ATX heading: marker, whitespace, body, an optional closing `#` run and an
 * optional newline. The captured body string is re-parsed as inline markdown;
 * when that re-parse fails the body is kept as one `inline-text` node. */
export const headingParser = map(
    seqC(capture(atxMarker, "level"), spaces, capture(headingBody, "body"), optional(trailingHashRun), optional(char("\n"))),
    (captured) => {
        const parsedBody = many1(inlineMarkdownParser)(captured.body);
        const content = parsedBody.success
            ? parsedBody.result
            : [{ type: "inline-text", content: captured.body }];
        return { type: "heading", level: captured.level, content };
    });
30
+ export const codeBlockParser = seqC(set("type", "code-block"), str("```"), capture(optional(languageTag), "language"), optional(spaces), capture(manyTillStr("```"), "content"), str("```"));
31
+ /* Multi-line and nested block quotes.
32
+ *
33
+ * - Consume consecutive lines beginning with "> " (the space is optional).
34
+ * - Join their stripped content with newlines.
35
+ * - Recursively re-parse the inner text: a sub-blockquote OR inline markdown.
36
+ *
37
+ * `lazy` defers the self-reference so we can recurse for nesting. */
38
+ const blockQuoteLine = map(seqC(char(">"), optional(char(" ")), capture(manyTillStr("\n"), "line"), or(char("\n"), eof)), ({ line }) => line);
39
+ // Inside the joined inner text, accept either a nested blockquote (possibly
40
+ // after a leading newline), a soft newline between lines, or any inline node.
41
+ const softNewline = map(char("\n"), () => ({ type: "inline-text", content: " " }));
42
+ const nestedBlockQuote = lazy(() => map(seqC(many(char("\n")), capture(blockQuoteParser, "quote")), ({ quote }) => quote));
43
+ const blockQuoteContent = or(nestedBlockQuote, softNewline, inlineMarkdownParser);
44
+ // Re-parse the joined inner text as a sequence of blockquote-content nodes.
45
+ // (We have to round-trip through a string because the `>` prefixes need to be
46
+ // stripped before nested blockquotes can be recognised.)
47
/**
 * Re-parses the text of a block quote (its `>` prefixes already stripped and
 * its lines joined with "\n") by applying `blockQuoteContent` one or more times.
 *
 * @param innerText the joined, de-prefixed quote text
 * @returns the parsed content nodes, or an empty array when the parse fails
 */
const reparseInner = (innerText) => {
    const outcome = many1(blockQuoteContent)(innerText);
    if (!outcome.success) {
        return [];
    }
    return outcome.result;
};
51
+ export const blockQuoteParser = map(many1(blockQuoteLine), (lines) => ({
52
+ type: "block-quote",
53
+ content: reparseInner(lines.join("\n")),
54
+ }));
55
+ /* Indented code block: one or more consecutive lines beginning with 4 spaces
56
+ * or a tab. The indent is stripped from each line. */
57
// The indent that marks an indented-code line: exactly four spaces, or one
// tab. A shorter literal would turn any line that merely starts with a space
// into a code block.
const indentPrefix = or(str("    "), char("\t"));
58
+ const indentedLine = map(seqC(indentPrefix, capture(manyTillStr("\n"), "line"), or(char("\n"), eof)), ({ line }) => line + "\n");
59
+ const indentedLines = map(many1(indentedLine), (lines) => lines.join(""));
60
+ export const indentedCodeBlockParser = seqC(set("type", "code-block"), set("language", null), capture(indentedLines, "content"));
61
+ /* Setext-style headings: a line of content followed by an underline of `=`
62
+ * (level 1) or `-` (level 2), terminated by `\n` or end-of-input. We capture
63
+ * the first line as a raw string, then re-parse it as inline markdown so the
64
+ * heading's content has the same shape as ATX headings. */
65
+ const setextLine = many1WithJoin(noneOf("\n"));
66
+ const setextH1Underline = map(many1(char("=")), () => 1);
67
+ const setextH2Underline = map(many1(char("-")), () => 2);
68
+ const _setextRaw = seqC(set("type", "heading"), capture(setextLine, "content"), char("\n"), capture(or(setextH1Underline, setextH2Underline), "level"), or(char("\n"), eof));
69
/* Setext heading node built from `_setextRaw`'s captures: the raw first line
 * is re-parsed as inline markdown, and kept as a single `inline-text` node
 * when that re-parse fails. `level` is passed through unchanged. */
export const setextHeadingParser = map(_setextRaw, ({ content: rawLine, level }) => {
    const parsedLine = many1(inlineMarkdownParser)(rawLine);
    const content = parsedLine.success
        ? parsedLine.result
        : [{ type: "inline-text", content: rawLine }];
    return { type: "heading", level, content };
});
79
+ const unorderedMarker = map(oneOf("-*+"), () => ({ ord: false, start: 1 }));
80
+ const orderedMarker = map(seqC(capture(many1WithJoin(digit), "digits"), char(".")), ({ digits }) => ({ ord: true, start: parseInt(digits, 10) }));
81
+ const indentOf = (n) => n > 0 ? str(" ".repeat(n)) : str("");
82
+ /* GFM task-list checkbox: `[ ]` (unchecked), `[x]` or `[X]` (checked).
83
+ * Must be followed by a single space (consumed) to count as a checkbox. */
84
+ const taskCheckbox = map(seqC(char("["), capture(or(char(" "), char("x"), char("X")), "mark"), str("] ")), ({ mark }) => mark !== " ");
85
+ const itemHeadOf = (indent, markerParser) => map(seqC(indentOf(indent), capture(markerParser, "marker"), char(" "), capture(optional(taskCheckbox), "checked"), capture(manyTillStr("\n"), "line"), or(char("\n"), eof)), ({ marker, checked, line }) => {
86
+ const raw = { marker, line };
87
+ if (checked !== null)
88
+ raw.checked = checked;
89
+ return raw;
90
+ });
91
/**
 * Parses the text of one list item as inline markdown.
 *
 * @param line the item text captured after the marker (and checkbox, if any)
 * @returns the inline nodes, or an empty array when `many1(inlineMarkdownParser)` fails
 */
const parseInline = (line) => {
    const outcome = many1(inlineMarkdownParser)(line);
    if (!outcome.success) {
        return [];
    }
    return outcome.result;
};
95
+ // One list item: an item-head followed by an optional sublist at +2 indent.
96
+ const itemWithSublist = (indent, markerParser) => map(seqC(capture(itemHeadOf(indent, markerParser), "raw"), capture(optional(lazy(() => listParserAt(indent + 2))), "sublist")), ({ raw, sublist }) => {
97
+ const item = { content: parseInline(raw.line) };
98
+ if (sublist)
99
+ item.sublist = sublist;
100
+ if (raw.checked !== undefined)
101
+ item.checked = raw.checked;
102
+ return { marker: raw.marker, item };
103
+ });
104
+ // A list of one or more items that all share a marker family.
105
+ const listOf = (indent, markerParser) => map(seqC(capture(itemWithSublist(indent, markerParser), "first"), capture(many(itemWithSublist(indent, markerParser)), "rest")), ({ first, rest }) => ({
106
+ type: "list",
107
+ ordered: first.marker.ord,
108
+ start: first.marker.start,
109
+ items: [first.item, ...rest.map((r) => r.item)],
110
+ }));
111
+ const listParserAt = (indent) => or(listOf(indent, unorderedMarker), listOf(indent, orderedMarker));
112
+ export const listParser = listParserAt(0);
113
+ /* Tables.
114
+ *
115
+ * Pipe-delimited GFM-style. A table is:
116
+ *
117
+ * | h1 | h2 | ← header row
118
+ * |----|:--:| ← separator row, with alignment markers
119
+ * | a | b | ← one or more data rows
120
+ *
121
+ * Each cell is `noneOf("|\n")`. We `map` the captured content to `.trim()`
122
+ * so headers/rows aren't padded with spaces. */
123
+ const cellContent = map(many1WithJoin(noneOf("|\n")), (s) => s.trim());
124
+ const cellThenBar = map(seqC(capture(cellContent, "cell"), char("|")), ({ cell }) => cell);
125
+ const tableRow = map(seqC(char("|"), capture(many1(cellThenBar), "cells"), or(char("\n"), eof)), ({ cells }) => cells);
126
/* One cell of the separator row: optional padding spaces, an optional leading
 * `:`, one or more `-`, an optional trailing `:`, optional padding spaces.
 * The two colons decide the column alignment:
 *   both → "center", trailing only → "right", leading only → "left",
 *   neither → null. */
const sepCell = map(
    seqC(many(char(" ")), capture(optional(char(":")), "left"), many1(char("-")), capture(optional(char(":")), "right"), many(char(" "))),
    ({ left, right }) => {
        // A capture that did not match is null.
        const hasLeading = left !== null;
        const hasTrailing = right !== null;
        if (hasLeading) {
            return hasTrailing ? "center" : "left";
        }
        return hasTrailing ? "right" : null;
    });
137
+ const sepCellThenBar = map(seqC(capture(sepCell, "cell"), char("|")), ({ cell }) => cell);
138
+ const sepRow = map(seqC(char("|"), capture(many1(sepCellThenBar), "cells"), or(char("\n"), eof)), ({ cells }) => cells);
139
+ /* HTML blocks (passthrough subset).
140
+ *
141
+ * A line starting with `<` followed by a letter, `/`, `!`, or `?` is treated
142
+ * as the start of a raw HTML block. The block extends until the next blank
143
+ * line or end of input. We don't try to balance tags — the content is kept
144
+ * as a single opaque string so downstream renderers can hand it to an HTML
145
+ * renderer untouched. */
146
+ const htmlBlockOpen = seqR(char("<"), or(letter, oneOf("/!?")));
147
+ // "\n" followed by zero or more spaces/tabs followed by another "\n" or end of input.
148
+ export const blankLine = seqR(char("\n"), many(oneOf(" \t")), or(char("\n"), eof));
149
+ // Peek at the opening (`not(not(...))` is a non-consuming lookahead), then
150
+ // consume everything up to the next blank line or eof.
151
+ export const htmlBlockParser = seqC(set("type", "html-block"), not(not(htmlBlockOpen)), capture(manyTill(or(blankLine, eof)), "content"));
152
+ export const tableParser = seqC(set("type", "table"), capture(tableRow, "headers"), capture(sepRow, "alignments"), capture(many1(tableRow), "rows"));
153
+ /* Horizontal rules: three-or-more of the same `-`, `*`, or `_`,
154
+ * with optional spaces between, ending in newline or eof. The "three or
155
+ * more" rule is expressed structurally — three explicit `char(c)`s followed
156
+ * by `many` more — so no count-and-validate wrapper is needed. */
157
+ const hrSpaces = many(char(" "));
158
+ const hrOf = (c) => map(seqR(hrSpaces, char(c), hrSpaces, char(c), hrSpaces, char(c), hrSpaces, many(seqR(char(c), hrSpaces)), or(char("\n"), eof)), () => ({ type: "horizontal-rule" }));
159
+ export const horizontalRuleParser = or(hrOf("-"), hrOf("*"), hrOf("_"));
160
+ // `blankLine` matches "\n", then zero or more spaces/tabs, then another "\n" or end of input.
161
+ // (`blankLine` is declared near `htmlBlockParser` above, since both need it at
162
+ // module-eval time.)
163
+ /* Block-level constructs that, if they would start at the *current* line
164
+ * position, must interrupt a soft-wrapped paragraph instead of being eaten
165
+ * as inline content. Setext is intentionally excluded — its underline is
166
+ * resolved by `setextHeadingParser` running ahead of `paragraphParser` in
167
+ * the top-level dispatch. */
168
// Openers that end a soft-wrapped paragraph when they appear at the start of
// the next line. Alternatives are tried in the order listed.
const blockInterrupt = or(
    seqR(atxMarker, char(" ")), // ATX heading: 1–6 `#`, then a space
    char(">"), // block quote
    str("```"), // fenced code block
    horizontalRuleParser, // 3+ of `-`, `*` or `_`, spaces allowed in between
    seqR(or(oneOf("-*+"), seqR(many1(digit), char("."))), char(" ")), // list marker, then a space
    char("|"), // table row
    htmlBlockOpen); // `<` followed by a letter, `/`, `!` or `?`
183
+ /* A paragraph node: an inline node OR a soft line break (single `\n` that
184
+ * isn't the start of a blank line *and* doesn't precede a block opener).
185
+ * Hard breaks (" \n" / "\\\n") win over soft breaks because they're
186
+ * matched earlier inside `inlineMarkdownParser`'s `or`. */
187
+ const paragraphSoftBreak = map(seqR(softBreakParser, not(blockInterrupt)), () => ({ type: "inline-soft-break" }));
188
+ const paragraphInline = map(seqC(not(blankLine), capture(or(paragraphSoftBreak, inlineMarkdownParser), "node")), ({ node }) => node);
189
+ export const paragraphParser = map(many1(paragraphInline), (content) => ({ type: "paragraph", content: content }));
@@ -0,0 +1,22 @@
1
+ /**
2
+ * YAML frontmatter parser for the Markdown example.
3
+ *
4
+ * Supports the spec from https://vitepress.dev/guide/frontmatter:
5
+ * ---
6
+ * title: Docs with VitePress
7
+ * editLink: true
8
+ * ---
9
+ *
10
+ * YAML coverage is a useful subset (top-level `key: value` only), built from
11
+ * Tarsec combinators per the project's "combinator-first" rule:
12
+ * - scalar values: bare strings, single/double-quoted strings, integers,
13
+ * floats, `true`/`false`, `null`/`~`
14
+ * - inline flow lists: [a, b, "c d"]
15
+ */
16
+ import { Parser } from "../../types.js";
17
+ import { Frontmatter } from "./types.js";
18
+ /**
19
+ * VitePress-style YAML frontmatter: a `---`-delimited block at the very top
20
+ * of a Markdown file.
21
+ */
22
+ export declare const frontmatterParser: Parser<Frontmatter>;
@@ -0,0 +1,80 @@
1
+ /**
2
+ * YAML frontmatter parser for the Markdown example.
3
+ *
4
+ * Supports the spec from https://vitepress.dev/guide/frontmatter:
5
+ * ---
6
+ * title: Docs with VitePress
7
+ * editLink: true
8
+ * ---
9
+ *
10
+ * YAML coverage is a useful subset (top-level `key: value` only), built from
11
+ * Tarsec combinators per the project's "combinator-first" rule:
12
+ * - scalar values: bare strings, single/double-quoted strings, integers,
13
+ * floats, `true`/`false`, `null`/`~`
14
+ * - inline flow lists: [a, b, "c d"]
15
+ */
16
+ import { capture, many, many1WithJoin, map, optional, or, seqC, seqR, sepBy, } from "../../combinators.js";
17
+ import { alphanum, char, eof, noneOf, oneOf, quotedString, str, } from "../../parsers.js";
18
+ // --- helpers -----------------------------------------------------------------
19
+ const hSpace = oneOf(" \t");
20
+ const hSpaces = many(hSpace);
21
+ const newlineOrEof = or(char("\n"), eof);
22
/**
 * Removes the first and last character of `s` — the quote characters
 * (`'`, `"`, or `` ` ``) that `quotedString` leaves around its match.
 * Strings shorter than two characters come back empty.
 */
const stripQuotes = (s) => s.slice(1, s.length - 1);
24
/**
 * Converts a bare (unquoted) scalar token into the value it denotes.
 *
 * The token is trimmed first. `true` / `false` become booleans, `null` / `~`
 * become `null`, a non-empty token that `Number()` turns into a finite number
 * becomes that number, and anything else is returned as the trimmed string.
 *
 * @param raw the unquoted token text, possibly padded with whitespace
 * @returns a boolean, `null`, a number, or the trimmed string
 */
function classifyBare(raw) {
    const token = raw.trim();
    switch (token) {
        case "true":
            return true;
        case "false":
            return false;
        case "null":
        case "~":
            return null;
        case "":
            // Number("") is 0, so the empty token must never reach the numeric check.
            return token;
    }
    const numeric = Number(token);
    // Number.isFinite is false for NaN and ±Infinity, so those tokens stay strings.
    return Number.isFinite(numeric) ? numeric : token;
}
44
+ // --- key ---------------------------------------------------------------------
45
+ // Conservative key chars: letters, digits, underscore, hyphen.
46
+ const keyChar = or(alphanum, oneOf("_-"));
47
+ const yamlKey = many1WithJoin(keyChar);
48
+ // --- scalar values -----------------------------------------------------------
49
+ // Quoted scalar — returns the inner string (quotes stripped).
50
+ const quotedScalar = map(quotedString, stripQuotes);
51
// Bare scalar in top-level context: runs to end-of-line. When nothing follows
// the colon (`key:`), `optional` yields null and the value is YAML null, so
// the entry still parses instead of failing the enclosing frontmatter block.
const bareValueLine = map(optional(many1WithJoin(noneOf("\n"))), (raw) => raw === null ? null : classifyBare(raw));
53
+ // Bare scalar inside a flow list `[...]`: ends at `,` or `]` (or `\n`).
54
+ const bareValueInList = map(many1WithJoin(noneOf(",]\n")), classifyBare);
55
+ // One element of a flow list: optional leading whitespace, then quoted or bare.
56
+ const listElement = map(seqC(hSpaces, capture(or(quotedScalar, bareValueInList), "value")), ({ value }) => value);
57
+ // Inline flow list: `[a, b, "c d"]`
58
+ const flowList = map(seqC(char("["), capture(sepBy(char(","), listElement), "items"), hSpaces, char("]")), ({ items }) => items);
59
+ // Any value: prefer flow list, then quoted, then bare.
60
+ const yamlValue = or(flowList, quotedScalar, bareValueLine);
61
+ // --- entries / body ----------------------------------------------------------
62
+ // `key: value` — gap between `:` and value may include horizontal whitespace.
63
+ const yamlEntry = map(seqC(capture(yamlKey, "key"), char(":"), hSpaces, capture(yamlValue, "value")), ({ key, value }) => [key, value]);
64
+ // One terminated entry: `key: value\n` (or eof-terminated).
65
+ const entryLine = map(seqC(capture(yamlEntry, "entry"), newlineOrEof), ({ entry }) => entry);
66
+ const yamlBody = many(entryLine);
67
+ // --- frontmatter -------------------------------------------------------------
68
+ const fence = str("---");
69
+ const fenceLine = seqR(fence, optional(hSpaces), char("\n"));
70
+ const closingFence = seqR(fence, optional(hSpaces), newlineOrEof);
71
/**
 * VitePress-style YAML frontmatter: a `---`-delimited block at the very top
 * of a Markdown file.
 *
 * Produces `{ type: "frontmatter", data }`, where `data` maps each key to its
 * parsed value. When a key appears more than once, the last entry wins.
 */
export const frontmatterParser = map(seqC(fenceLine, capture(yamlBody, "entries"), closingFence), ({ entries }) => {
    // Object.fromEntries defines own data properties, so a key spelled
    // `__proto__` (which `yamlKey` accepts) is stored like any other key
    // rather than being treated as an assignment to the prototype of `data`.
    const data = Object.fromEntries(entries);
    return { type: "frontmatter", data };
});
@@ -0,0 +1,7 @@
1
+ export * from "./types.js";
2
+ export * from "./inline.js";
3
+ export * from "./blocks.js";
4
+ export * from "./references.js";
5
+ export * from "./frontmatter.js";
6
+ import { Parser } from "../../types.js";
7
+ export declare const markdownParser: Parser<unknown[]>;
@@ -0,0 +1,27 @@
1
+ export * from "./types.js";
2
+ export * from "./inline.js";
3
+ export * from "./blocks.js";
4
+ export * from "./references.js";
5
+ export * from "./frontmatter.js";
6
+ import { seq, sepBy, or, optional, many1, map } from "../../combinators.js";
7
+ import { spaces, newline } from "../../parsers.js";
8
+ import { headingParser, codeBlockParser, blockQuoteParser, paragraphParser, imageParser, horizontalRuleParser, setextHeadingParser, indentedCodeBlockParser, listParser, tableParser, htmlBlockParser, } from "./blocks.js";
9
+ import { linkDefinitionParser, footnoteDefinitionParser, resolveReferences, } from "./references.js";
10
+ import { frontmatterParser } from "./frontmatter.js";
11
+ // Block separator: one or more newlines (with optional trailing horizontal
12
+ // whitespace). Crucially this does NOT consume leading indentation on the
13
+ // next block — so a 4-space indented code block isn't dewhitespaced before
14
+ // indentedCodeBlockParser ever sees it.
15
+ const blockSeparator = many1(newline);
16
// Top-level block alternatives, tried in this order.
const topLevelBlock = or(setextHeadingParser, horizontalRuleParser, headingParser, codeBlockParser, indentedCodeBlockParser, tableParser, blockQuoteParser, listParser, htmlBlockParser, linkDefinitionParser, footnoteDefinitionParser, paragraphParser, imageParser);
// Whole document: optional frontmatter, optional whitespace, blocks separated
// by `blockSeparator`, optional trailing whitespace. The result is the block
// list, with the frontmatter node placed first when one was parsed.
const _markdownParser = seq([
    optional(frontmatterParser),
    optional(spaces),
    sepBy(blockSeparator, topLevelBlock),
    optional(spaces),
], (parts) => {
    const frontmatter = parts[0];
    const blocks = parts[2];
    if (frontmatter) {
        return [frontmatter, ...blocks];
    }
    return blocks;
});
26
+ // Resolve [id]: url definitions across the AST after parsing.
27
+ export const markdownParser = map(_markdownParser, (nodes) => resolveReferences(nodes));
@@ -0,0 +1,48 @@
1
+ import { Parser } from "../../types.js";
2
+ import { InlineMarkdown, InlineText, InlineBold, InlineItalic, InlineBoldItalic, InlineStrike, InlineHardBreak, InlineSoftBreak, InlineLink, InlineCode, Image, InlineRefLink, InlineRefImage, InlineFootnoteRef, InlineHTML } from "./types.js";
3
+ export declare const inlineTextParser: Parser<InlineText>;
4
+ /**
5
+ * Run `inlineMarkdownParser` repeatedly until `stop` would match at the
6
+ * current position. The `stop` parser is a lookahead — it is *not* consumed.
7
+ * Returns the list of inline nodes collected before `stop`.
8
+ *
9
+ * Used by every delimited inline parser (bold, italic, strike, link, …) so
10
+ * that the content between delimiters is a sequence of inline nodes rather
11
+ * than a flat string.
12
+ */
13
+ export declare const inlineSeqUntil: (stop: Parser<unknown>) => Parser<InlineMarkdown[]>;
14
+ export declare const inlineBoldParser: Parser<InlineBold>;
15
+ export declare const inlineItalicParser: Parser<InlineItalic>;
16
+ export declare const inlineLinkParser: Parser<InlineLink>;
17
+ export declare const inlineCodeParser: Parser<InlineCode>;
18
+ export declare const inlineEscapeParser: Parser<InlineText>;
19
+ export declare const inlineBoldItalicParser: Parser<InlineBoldItalic>;
20
+ export declare const inlineBoldUnderscoreParser: Parser<InlineBold>;
21
+ export declare const inlineItalicUnderscoreParser: Parser<InlineItalic>;
22
+ export declare const urlAutolinkParser: Parser<InlineLink>;
23
+ export declare const emailAutolinkParser: Parser<InlineLink>;
24
+ export declare const autolinkParser: Parser<InlineLink>;
25
+ export declare const bareUrlAutolinkParser: Parser<InlineLink>;
26
+ export declare const htmlOpenTagParser: Parser<InlineHTML>;
27
+ export declare const htmlCloseTagParser: Parser<InlineHTML>;
28
+ export declare const htmlCommentParser: Parser<InlineHTML>;
29
+ export declare const htmlInlineParser: Parser<InlineHTML>;
30
+ export declare const inlineFootnoteRefParser: Parser<InlineFootnoteRef>;
31
+ export declare const inlineRefLinkParser: Parser<InlineRefLink>;
32
+ export declare const inlineRefImageParser: Parser<InlineRefImage>;
33
+ /** An inline image: ![alt](url) or ![alt](url "title"). Lives in `inline.ts`
34
+ * so it can participate in paragraph parsing without `blocks.ts` becoming a
35
+ * circular dep. */
36
+ export declare const imageParser: Parser<Image>;
37
+ export declare const hardBreakParser: Parser<InlineHardBreak>;
38
+ /** A single `\n` that is *not* part of a blank line (which would terminate the
39
+ * enclosing paragraph). Hard breaks are matched earlier in `inlineMarkdownParser`'s
40
+ * `or` so a " \n" stays a hard break, never a soft one. */
41
+ export declare const softBreakParser: Parser<InlineSoftBreak>;
42
+ export declare const inlineStrikeParser: Parser<InlineStrike>;
43
+ export declare const htmlEntityParser: Parser<InlineText>;
44
+ /** Last-resort: consume a single delimiter char as literal text so unmatched
45
+ * delimiters (e.g. the `_` in snake_case_word, or a stray `*`) don't crash
46
+ * the paragraph. Matches one of the inline-text stop characters. */
47
+ export declare const inlineLiteralCharParser: Parser<InlineText>;
48
+ export declare const inlineMarkdownParser: Parser<InlineMarkdown>;
@@ -0,0 +1,249 @@
1
+ import { seqC, seqR, capture, captureCaptures, or, not, map, many, many1, many1Till, many1WithJoin, manyWithJoin, manyTillStr, iManyTillStr, count, exactly, lazy, } from "../../combinators.js";
2
+ import { str, char, eof, set, oneOf, alphanum, noneOf, digit, letter, anyChar } from "../../parsers.js";
3
+ import { success, failure } from "../../types.js";
4
+ import { optional, between } from "../../combinators.js";
5
+ // Stop inline-text at any single delimiter char, at a newline, or at the space
6
+ // sequence matched by `str(...)` below (the lead-in of a hard break). Using
7
+ // many1Till with an `or` of delimiters makes the stop set composable rather than
8
+ // embedded inside a regex. `]` is included so that inline-text inside a link-text (`[...]`) terminates at the closing `]`.
9
+ const inlineTextStop = or(oneOf("*_`[]!<~\\&\n"), str(" "));
10
+ /** Plain text run collected by `many1Till(inlineTextStop)` and wrapped as an `inline-text` node; the stop itself is left for the next parser. */ export const inlineTextParser = map(many1Till(inlineTextStop), (content) => ({ type: "inline-text", content }));
11
+ /**
12
+ * Run `inlineMarkdownParser` repeatedly until `stop` would match at the
13
+ * current position. `stop` is only a lookahead (`not(stop)`) — it is *not*
14
+ * consumed, so the caller matches the closing delimiter itself.
15
+ * `lazy` defers the lookup because `inlineMarkdownParser` is defined at the very end of this module.
16
+ * Used by the delimited inline parsers in this file (bold, italic, strike, link)
17
+ * so that the content between delimiters is a sequence of inline nodes rather
18
+ * than a flat string. Returns the nodes collected before `stop`, in source order.
19
+ */
20
+ export const inlineSeqUntil = (stop) => many(map(seqC(not(stop), capture(lazy(() => inlineMarkdownParser), "node")), ({ node }) => node));
21
+ /** Bold written with two asterisks on each side; everything between the markers is parsed as nested inline nodes. */ export const inlineBoldParser = map(seqC(str("**"), capture(inlineSeqUntil(str("**")), "content"), str("**")), ({ content }) => ({ type: "inline-bold", content: content }));
22
+ /** Italic written with one asterisk on each side; the leading `not(...)` keeps it from starting on a bold marker. */ export const inlineItalicParser = map(seqC(not(str("**")), char("*"), capture(inlineSeqUntil(char("*")), "content"), char("*")), ({ content }) => ({ type: "inline-italic", content: content }));
23
+ /* URL + title pieces shared by the inline-link and inline-image parsers.
24
+ * `urlToken` ends at a space, tab, newline or `)`. Empty destinations (`[a]()`)
25
+ * are allowed via `manyWithJoin` (zero-or-more). `titleClause` is one or more
26
+ * spaces followed by `"..."` or `'...'` and yields the text between the quotes;
27
+ * it is not optional by itself — the link/image parsers wrap it in `optional`. */
28
+ const urlToken = manyWithJoin(noneOf(" \t\n)"));
29
+ const titleClause = map(seqC(many1(char(" ")), captureCaptures(or(seqC(char('"'), capture(manyTillStr('"'), "title"), char('"')), seqC(char("'"), capture(manyTillStr("'"), "title"), char("'"))))), ({ title }) => title);
30
+ /** Inline link `[content](url)` or `[content](url "title")`; `content` is parsed as nested inline nodes up to the first `]`. */ export const inlineLinkParser = map(seqC(char("["), capture(inlineSeqUntil(char("]")), "content"), str("]("), capture(urlToken, "url"), capture(optional(titleClause), "title"), char(")")), ({ content, url, title }) => {
31
+ const link = {
32
+ type: "inline-link",
33
+ content: content,
34
+ url,
35
+ };
36
+ if (title != null) // leave the `title` key off entirely when no title clause matched
37
+ link.title = title;
38
+ return link;
39
+ });
40
+ /* Multi-backtick code spans.
41
+ *
42
+ * `foo` → "foo"
43
+ * ``a`b`` → "a`b" (close on exactly N backticks)
44
+ * `` foo `` → "foo" (strip one space on each side when both sides have one)
45
+ * ` ` → " " (don't strip if content is all spaces)
46
+ *
47
+ * The opener is a run of N backticks; the closer is another run of *exactly*
48
+ * N backticks. Body atoms are either a single non-tick char or a tick run
49
+ * whose length is *not* N (so it can't be misread as the closer). The opener
50
+ * count threads into the closer via `inlineCodeParser`, a hand-written wrapper — every other piece is
51
+ * combinator-shaped. */
52
+ /** Opening fence; its result is used as the backtick count N. */ const tickRun = count(char("`"));
53
+ /** Exactly `n` backticks that are followed by a non-backtick or by end of input. */ const tickRunOf = (n) => seqR(exactly(n, char("`")), or(not(char("`")), eof));
54
+ /** One body atom: a non-backtick char, or a whole backtick run that is not a valid closer for `n`. */ const codeBodyAtom = (n) => or(noneOf("`"), map(seqR(not(tickRunOf(n)), many1(char("`"))), (parts) => parts[1].join("")));
55
+ /** Zero or more body atoms joined into one string. */ const codeBody = (n) => manyWithJoin(codeBodyAtom(n));
56
+ /** Drops one leading and one trailing space when both are present and the content is not whitespace-only. */ const stripCodeSpan = (s) => s.length >= 2 && s.startsWith(" ") && s.endsWith(" ") && s.trim().length > 0
57
+ ? s.slice(1, -1)
58
+ : s;
59
+ /** Code span parser. Written by hand rather than as one combinator chain because the opener's backtick count must be passed to the body and closer parsers. */ export const inlineCodeParser = (input) => {
60
+ const opened = tickRun(input); // measure the opening backtick run
61
+ if (!opened.success)
62
+ return opened;
63
+ const n = opened.result;
64
+ const closed = map(seqR(codeBody(n), tickRunOf(n)), (parts) => stripCodeSpan(parts[0]))(opened.rest); // body + matching closer, parsed from just after the opener
65
+ if (!closed.success) {
66
+ return failure("unmatched code span fence", input); // report against the original input so nothing is consumed
67
+ }
68
+ return success({ type: "inline-code", content: closed.result }, closed.rest);
69
+ };
70
+ /** Characters that may follow a backslash to form an escape. */ const ESCAPABLE = "\\`*_{}[]()#+-.!~<>|";
71
+ /** Backslash escape: a backslash followed by one `ESCAPABLE` character becomes an `inline-text` node holding just that character. */ export const inlineEscapeParser = seqC(set("type", "inline-text"), char("\\"), capture(oneOf(ESCAPABLE), "content"));
72
+ /** Combined bold + italic: triple-asterisk delimiters first, then triple-underscore; both produce the same node type with nested inline content. */ export const inlineBoldItalicParser = or(map(seqC(str("***"), capture(inlineSeqUntil(str("***")), "content"), str("***")), ({ content }) => ({
73
+ type: "inline-bold-italic",
74
+ content: content,
75
+ })), map(seqC(str("___"), capture(inlineSeqUntil(str("___")), "content"), str("___")), ({ content }) => ({
76
+ type: "inline-bold-italic",
77
+ content: content,
78
+ })));
79
+ /** Bold written as `__…__`. The trailing `not(alphanum)` rejects a closer that runs straight into a word character. */ export const inlineBoldUnderscoreParser = map(seqC(str("__"), capture(inlineSeqUntil(str("__")), "content"), str("__"), not(alphanum)), ({ content }) => ({ type: "inline-bold", content: content }));
80
+ /** Italic written as `_…_`. Refuses to start on `__`, and the trailing `not(alphanum)` rejects a closer that runs straight into a word character. */ export const inlineItalicUnderscoreParser = map(seqC(not(str("__")), char("_"), capture(inlineSeqUntil(char("_")), "content"), char("_"), not(alphanum)), ({ content }) => ({ type: "inline-italic", content: content }));
81
+ // URL body inside <...>: `http://` or `https://` followed by one or more chars that are not space, tab, newline, `<` or `>`.
82
+ const urlBody = map(seqR(str("http"), or(str("s"), str("")), str("://"), many1WithJoin(noneOf(" \t\n<>"))), (parts) => parts.join(""));
83
+ // Email body: part@part.part, where a part has no whitespace, `<`, `>`, `@` or `.`. NOTE(review): this admits exactly one `.` after the `@` and none before it, so `a.b@c.d` and `a@b.c.d` do not match — confirm that is intended.
84
+ const emailPart = many1WithJoin(noneOf(" \t\n<>@."));
85
+ const emailBody = map(seqR(emailPart, char("@"), emailPart, char("."), emailPart), (parts) => parts.join(""));
86
+ // Wrap a literal string as the single-element inline-text array used as InlineLink content.
87
+ const asTextContent = (s) => [
88
+ { type: "inline-text", content: s },
89
+ ];
90
+ /** `<url>` autolink: the URL is used both as `url` and as the link text. */ export const urlAutolinkParser = map(seqC(char("<"), capture(urlBody, "url"), char(">")), ({ url }) => ({ type: "inline-link", content: asTextContent(url), url }));
91
+ /** `<email>` autolink: the address is the link text and `url` gets a `mailto:` prefix. */ export const emailAutolinkParser = map(seqC(char("<"), capture(emailBody, "email"), char(">")), ({ email }) => ({
92
+ type: "inline-link",
93
+ content: asTextContent(email),
94
+ url: `mailto:${email}`,
95
+ }));
96
+ /** Angle-bracket autolink: URL form first, then email form. */ export const autolinkParser = or(urlAutolinkParser, emailAutolinkParser);
97
+ /* Bare-URL GFM autolinks: `http(s)://…` without surrounding `<>`. The body
98
+ * is built from three kinds of atom so the punctuation/paren-balance rules
99
+ * fall out of combinator composition:
100
+ * - `bareUrlParenGroup` — a balanced `(...)` (recursive via `lazy`), so
101
+ * Wikipedia-style URLs like `…Lisp_(programming_language)` keep their
102
+ * parens and an unmatched trailing `)` falls through to the surrounding
103
+ * text;
104
+ * - `bareUrlPunctMidway` — one of `.,!?;:`, rejected when all that follows
105
+ * is more such punctuation and then a `urlBodyStop`, so trailing sentence
106
+ * punctuation stays in the surrounding text (a `not(...)` lookahead does the work);
107
+ * - `bareUrlNormalChar` — any char that is not whitespace, `<`, `>`, a paren or one of `.,!?;:`.
108
+ *
109
+ * `urlBodyStop` is the lookahead set that ends a URL outside a paren group:
110
+ * whitespace, `<`, `>`, `)`, or end-of-input. */
111
+ const bareUrlScheme = map(seqC(capture(str("http"), "scheme"), capture(optional(char("s")), "s"), str("://")), ({ scheme, s }) => scheme + (s !== null && s !== void 0 ? s : "") + "://");
112
+ const urlBodyStop = or(oneOf(" \t\n<>)"), eof);
113
+ const urlTrailingPunct = oneOf(".,!?;:");
114
+ const bareUrlNormalChar = noneOf(" \t\n<>().,!?;:");
115
+ const bareUrlPunctMidway = map(seqC(capture(urlTrailingPunct, "p"),
116
+ // Reject if the remainder is just more punct then a URL stop — that
117
+ // would mean this `.` (or `,`/`!`/etc) is part of a trailing run.
118
+ not(seqR(many(urlTrailingPunct), urlBodyStop))), ({ p }) => p);
119
+ const bareUrlAtom = lazy(() => or(bareUrlParenGroup, bareUrlPunctMidway, bareUrlNormalChar)); // `lazy` because `bareUrlParenGroup` is declared on the next line and refers back to this atom
120
+ const bareUrlParenGroup = map(seqC(capture(char("("), "open"), capture(manyWithJoin(bareUrlAtom), "inner"), capture(char(")"), "close")), ({ open, inner, close }) => open + inner + close);
121
+ const bareUrlBody = many1WithJoin(bareUrlAtom);
122
+ /** Bare URL autolink: scheme + body are concatenated and used both as `url` and as the link text. */ export const bareUrlAutolinkParser = map(seqC(capture(bareUrlScheme, "scheme"), capture(bareUrlBody, "body")), ({ scheme, body }) => {
123
+ const url = scheme + body;
124
+ return {
125
+ type: "inline-link",
126
+ content: [{ type: "inline-text", content: url }],
127
+ url,
128
+ };
129
+ });
130
+ /* Inline HTML passthrough.
131
+ *
132
+ * Three shapes are supported (each in its own exported parser):
133
+ * - open / self-closing tags: `<a>`, `<a href="x">`, `<br/>`,
134
+ * - close tags: `</a>`, `</a >`,
135
+ * - comments: `<!-- … -->`.
136
+ *
137
+ * The output is always an `InlineHTML` node whose `content` is the raw source
138
+ * (including the angle brackets), so downstream renderers pass it through
139
+ * untouched. We do not try to balance opens/closes or sanitise anything.
140
+ *
141
+ * The pieces below are shared helpers: `htmlWS`/`htmlWS1` (optional / required
142
+ * whitespace), `htmlTagName`, `htmlAttrName`, the three attribute-value forms, `htmlAttribute` and
143
+ * `htmlAttributes`. Each returns its matched source text so the reconstructed string mirrors the source exactly. */
144
+ const htmlWS = manyWithJoin(oneOf(" \t\n"));
145
+ const htmlWS1 = many1WithJoin(oneOf(" \t\n"));
146
+ const htmlTagName = map(seqC(capture(letter, "first"), capture(manyWithJoin(or(alphanum, char("-"))), "rest")), ({ first, rest }) => first + rest);
147
+ const htmlAttrName = map(seqC(capture(or(letter, char("_"), char(":")), "first"), capture(manyWithJoin(or(alphanum, oneOf("_:.-"))), "rest")), ({ first, rest }) => first + rest);
148
+ const dqAttrValue = map(seqC(char('"'), capture(manyTillStr('"'), "v"), char('"')), ({ v }) => `"${v}"`);
149
+ const sqAttrValue = map(seqC(char("'"), capture(manyTillStr("'"), "v"), char("'")), ({ v }) => `'${v}'`);
150
+ const unquotedAttrValue = many1WithJoin(noneOf(" \t\n\"'=<>`"));
151
+ const htmlAttrValue = or(dqAttrValue, sqAttrValue, unquotedAttrValue);
152
+ /* Optional `= value` suffix on an attribute. Whitespace is allowed on both
153
+ * sides of the `=`; it is captured and re-emitted unchanged. */
154
+ const htmlAttrEq = map(seqC(capture(htmlWS, "wsBefore"), char("="), capture(htmlWS, "wsAfter"), capture(htmlAttrValue, "v")), ({ wsBefore, wsAfter, v }) => `${wsBefore}=${wsAfter}${v}`);
155
+ const htmlAttribute = map(seqC(capture(htmlAttrName, "name"), capture(optional(htmlAttrEq), "eq")), ({ name, eq }) => name + (eq !== null && eq !== void 0 ? eq : ""));
156
+ /* Zero or more attributes, each separated from the previous token by
157
+ * at least one whitespace char. Returns the joined source (including the
158
+ * separating whitespace) so the outer parser can reconstruct the original. */
159
+ const htmlAttributes = manyWithJoin(map(seqC(capture(htmlWS1, "ws"), capture(htmlAttribute, "attr")), ({ ws, attr }) => ws + attr));
160
+ /** Opening or self-closing tag: `<`, tag name, attributes, optional whitespace, optional `/`, `>`. */ export const htmlOpenTagParser = map(seqC(char("<"), capture(htmlTagName, "name"), capture(htmlAttributes, "attrs"), capture(htmlWS, "ws"), capture(optional(char("/")), "selfClose"), char(">")), ({ name, attrs, ws, selfClose }) => ({
161
+ type: "inline-html",
162
+ content: `<${name}${attrs}${ws}${selfClose !== null && selfClose !== void 0 ? selfClose : ""}>`,
163
+ }));
164
+ /** Closing tag: `</`, tag name, optional whitespace, `>`. */ export const htmlCloseTagParser = map(seqC(str("</"), capture(htmlTagName, "name"), capture(htmlWS, "ws"), char(">")), ({ name, ws }) => ({
165
+ type: "inline-html",
166
+ content: `</${name}${ws}>`,
167
+ }));
168
+ /* HTML comments: `<!-- … -->`. Rules enforced by this parser (NOTE(review): compare with the CommonMark HTML-comment rule):
169
+ * - the body may not contain `--`,
170
+ * - the body may not start or end with `>`.
171
+ *
172
+ * Expressed as pure combinators by baking the constraints into the body atom:
173
+ * - `not(str("-->"))` so we stop cleanly at the closer (already implied by the `--` check, kept for readability),
174
+ * - `not(str("--"))` rejects a `--` mid-body,
175
+ * - `not(seqR(char(">"), str("-->")))` is the "end-of-body `>` " rule —
176
+ * a `>` directly before the closer is rejected, since accepting it
177
+ * would let the comment end on `>`.
178
+ *
179
+ * The start-of-body `>` rule is enforced with one `not(char(">"))` placed
180
+ * before the body's `many1`. An empty body falls through to `optional`'s
181
+ * null branch, which leaves the input unconsumed so the closer can match
182
+ * immediately; the outer `map` turns that null into "". */
183
+ const commentBodyChar = map(seqC(not(str("-->")), not(str("--")), not(seqR(char(">"), str("-->"))), capture(anyChar, "c")), ({ c }) => c);
184
+ const commentBody = map(optional(map(seqC(not(char(">")), capture(many1WithJoin(commentBodyChar), "body")), ({ body }) => body)), (body) => body !== null && body !== void 0 ? body : "");
185
+ export const htmlCommentParser = map(seqC(str("<!--"), capture(commentBody, "body"), str("-->")), ({ body }) => ({
186
+ type: "inline-html",
187
+ content: `<!--${body}-->`,
188
+ }));
189
+ /* Inline HTML dispatch. The alternatives differ in what must follow the
190
+ * opening `<`: a comment needs `!--`, a close tag needs `/`, and an open tag
191
+ * needs the first character of `htmlTagName` (a `letter`). Assuming `letter`
192
+ * matches neither `!` nor `/`, at most one alternative can succeed on a given
193
+ * input, so the order only decides which one is attempted first. */
194
+ export const htmlInlineParser = or(htmlCommentParser, htmlCloseTagParser, htmlOpenTagParser);
195
+ // Footnote reference: `[^id]` (id is one or more chars other than `]`, space, tab and newline).
196
+ export const inlineFootnoteRefParser = seqC(set("type", "inline-footnote-ref"), str("[^"), capture(many1WithJoin(noneOf("] \n\t")), "id"), char("]"));
197
+ // `[...]` where ... consists of characters that aren't `]` or newline; `bracketedAsString` joins them into one string.
198
+ const bracketed = between(char("["), char("]"), noneOf("]\n"));
199
+ const bracketedAsString = map(bracketed, (chars) => chars.join(""));
200
+ /** Reference link `[text]` or `[text][id]`. `id` is the second bracket's text when it is non-empty, otherwise `text`. The node is a placeholder that `resolveReferences` rewrites later. */ export const inlineRefLinkParser = map(seqC(capture(bracketedAsString, "text"), capture(optional(bracketedAsString), "rawId"), not(char("(")) // disambiguate from inline link
201
+ ), ({ text, rawId }) => ({
202
+ type: "inline-ref-link",
203
+ text,
204
+ id: rawId && rawId.length > 0 ? rawId : text,
205
+ }));
206
+ /** Reference image `![alt]` or `![alt][id]`, not followed by `(`. `id` is the second bracket's text when it is non-empty, otherwise `alt`. */ export const inlineRefImageParser = map(seqC(char("!"), capture(bracketedAsString, "alt"), capture(optional(bracketedAsString), "rawId"), not(char("("))), ({ alt, rawId }) => ({
207
+ type: "inline-ref-image",
208
+ alt,
209
+ id: rawId && rawId.length > 0 ? rawId : alt,
210
+ }));
211
+ /** An inline image: ![alt](url) or ![alt](url "title"). Lives in `inline.ts`
212
+ * so it can participate in paragraph parsing without `blocks.ts` becoming a
213
+ * circular dep. The alt text is everything up to the next `](` and is kept as a plain string. NOTE(review): that scan is not limited to the first `]`, so it may run past an unrelated bracket pair — confirm. */
214
+ export const imageParser = map(seqC(str("!["), capture(iManyTillStr("]("), "alt"), str("]("), capture(urlToken, "url"), capture(optional(titleClause), "title"), char(")")), ({ alt, url, title }) => {
215
+ const img = { type: "image", alt, url };
216
+ if (title != null) // leave the `title` key off entirely when no title clause matched
217
+ img.title = title;
218
+ return img;
219
+ });
220
+ /** Hard line break; the matched text is discarded and a payload-free marker node is returned. */ export const hardBreakParser = map(or(
221
+ // two-or-more trailing spaces then newline
222
+ seqR(str(" "), many(char(" ")), char("\n")),
223
+ // backslash then newline
224
+ seqR(char("\\"), char("\n"))), () => ({ type: "inline-hard-break" }));
225
+ /** A single `\n` that is *not* immediately followed by another `\n` (a blank line would terminate the
226
+ * enclosing paragraph). Hard breaks are handled by `hardBreakParser`, the first alternative of `inlineMarkdownParser`,
227
+ * so trailing spaces + `\n` stay a hard break. NOTE(review): this parser is not itself listed in `inlineMarkdownParser`'s `or`; callers have to try it separately. */
228
+ export const softBreakParser = map(seqR(char("\n"), not(char("\n"))), () => ({ type: "inline-soft-break" }));
229
+ /** Strikethrough written as `~~…~~`; the content between the markers is parsed as nested inline nodes. */ export const inlineStrikeParser = map(seqC(str("~~"), capture(inlineSeqUntil(str("~~")), "content"), str("~~")), ({ content }) => ({
230
+ type: "inline-strike",
231
+ content: content,
232
+ }));
233
+ /* HTML entities. Decodes:
234
+ * - the five XML-core named entities (`&amp;`, `&lt;`, `&gt;`, `&quot;`,
235
+ * `&apos;`) into their literal characters,
236
+ * - decimal numeric references (`&#NN;`),
237
+ * - hexadecimal numeric references (`&#xNN;` / `&#XNN;`).
238
+ *
239
+ * Unknown named entities (e.g. `&unknown;`) fail this parser and fall
240
+ * through to `inlineLiteralCharParser`, which emits a literal `&`. */
241
+ const namedEntity = or(map(str("&amp;"), () => "&"), map(str("&lt;"), () => "<"), map(str("&gt;"), () => ">"), map(str("&quot;"), () => '"'), map(str("&apos;"), () => "'"));
242
+ const decimalEntity = map(seqC(str("&#"), capture(many1WithJoin(digit), "digits"), char(";")), ({ digits }) => String.fromCodePoint(parseInt(digits, 10)));
243
+ const hexEntity = map(seqC(or(str("&#x"), str("&#X")), capture(many1WithJoin(oneOf("0123456789abcdefABCDEF")), "digits"), char(";")), ({ digits }) => String.fromCodePoint(parseInt(digits, 16)));
244
+ export const htmlEntityParser = map(or(hexEntity, decimalEntity, namedEntity), (content) => ({ type: "inline-text", content }));
245
+ /** Last-resort: consume a single delimiter char as literal text so unmatched
246
+ * delimiters (e.g. the `_` in snake_case_word, or a stray `*`) don't crash
247
+ * the paragraph. Covers the single-character inline-text stops except `\n`; the space sequence that also stops inline-text is not covered either. */
248
+ export const inlineLiteralCharParser = seqC(set("type", "inline-text"), capture(oneOf("*_`[]!<~\\&"), "content"));
249
+ /** One inline node, chosen by ordered choice. Order matters: hard breaks and escapes come first, triple-delimiter emphasis precedes double and single, images precede reference images and links, and plain text / single literal delimiters are the final fallbacks. */ export const inlineMarkdownParser = or(hardBreakParser, inlineEscapeParser, inlineBoldItalicParser, inlineBoldParser, inlineItalicParser, inlineBoldUnderscoreParser, inlineItalicUnderscoreParser, inlineStrikeParser, autolinkParser, bareUrlAutolinkParser, htmlInlineParser, imageParser, inlineRefImageParser, inlineFootnoteRefParser, inlineLinkParser, inlineRefLinkParser, inlineCodeParser, htmlEntityParser, inlineTextParser, inlineLiteralCharParser);
@@ -0,0 +1,5 @@
1
+ import { Parser } from "../../types.js";
2
+ import { LinkDef, FootnoteDef } from "./types.js";
3
+ /** Parses a reference link definition: `[id]: url` with an optional double-quoted title. */ export declare const linkDefinitionParser: Parser<LinkDef>;
4
+ /** Parses a single-line footnote definition: `[^id]: text`. */ export declare const footnoteDefinitionParser: Parser<FootnoteDef>;
5
+ /** Post-parse pass: removes the top-level link / footnote definitions from `ast` and rewrites the reference nodes that point at them; returns a new array. */ export declare function resolveReferences(ast: unknown[]): unknown[];
@@ -0,0 +1,96 @@
1
+ import { seqC, capture, optional, many1WithJoin, map, } from "../../combinators.js";
2
+ import { char, str, set, noneOf, spaces } from "../../parsers.js";
3
+ /* Reference link definitions.
4
+ *
5
+ * [id]: url
6
+ * [id]: url "title"
7
+ *
8
+ * Built entirely from combinators. The title is `seqC(spaces, '"', titleChars, '"')`,
9
+ * unwrapped to the captured text via `map`. Only double quotes are accepted, and the title may not contain a newline. */
10
+ const idChars = many1WithJoin(noneOf("]\n"));
11
+ const urlChars = many1WithJoin(noneOf(" \t\n"));
12
+ const titleChars = many1WithJoin(noneOf('"\n'));
13
+ const titleParser = map(seqC(spaces, char('"'), capture(titleChars, "title"), char('"')), ({ title }) => title);
14
+ export const linkDefinitionParser = seqC(set("type", "link-definition"), char("["), capture(idChars, "id"), str("]:"), spaces, capture(urlChars, "url"), optional(capture(titleParser, "title")));
15
+ /* Footnote definitions: `[^id]: text` on a single line; `content` is the rest of the line, kept as a raw string. */
16
+ export const footnoteDefinitionParser = seqC(set("type", "footnote-definition"), str("[^"), capture(many1WithJoin(noneOf("] \n\t")), "id"), str("]:"), spaces, capture(many1WithJoin(noneOf("\n")), "content"));
17
+ /* Resolution pass.
18
+ *
19
+ * Walk the AST. Collect link-definitions, then rewrite ref nodes to inline
20
+ * links/images and strip the definitions. Id matching is case-insensitive. */
21
+ export function resolveReferences(ast) {
22
+ const linkDefs = new Map();
23
+ const footnoteDefs = new Map();
24
+ for (const node of ast) {
25
+ if (!isObj(node))
26
+ continue;
27
+ const t = node.type;
28
+ if (t === "link-definition") {
29
+ const def = node;
30
+ linkDefs.set(def.id.toLowerCase(), def);
31
+ }
32
+ else if (t === "footnote-definition") {
33
+ const def = node;
34
+ footnoteDefs.set(def.id.toLowerCase(), def);
35
+ }
36
+ }
37
+ function walk(node) {
38
+ if (Array.isArray(node))
39
+ return node.map(walk);
40
+ if (!isObj(node))
41
+ return node;
42
+ const obj = node;
43
+ if (obj.type === "inline-ref-link") {
44
+ const def = linkDefs.get(String(obj.id).toLowerCase());
45
+ if (def) {
46
+ const link = {
47
+ type: "inline-link",
48
+ content: [{ type: "inline-text", content: String(obj.text) }],
49
+ url: def.url,
50
+ };
51
+ if (def.title != null)
52
+ link.title = def.title;
53
+ return link;
54
+ }
55
+ return { type: "inline-text", content: `[${obj.text}]` };
56
+ }
57
+ if (obj.type === "inline-ref-image") {
58
+ const def = linkDefs.get(String(obj.id).toLowerCase());
59
+ if (def) {
60
+ const img = {
61
+ type: "image",
62
+ url: def.url,
63
+ alt: obj.alt,
64
+ };
65
+ if (def.title != null)
66
+ img.title = def.title;
67
+ return img;
68
+ }
69
+ return { type: "inline-text", content: `![${obj.alt}]` };
70
+ }
71
+ if (obj.type === "inline-footnote-ref") {
72
+ const def = footnoteDefs.get(String(obj.id).toLowerCase());
73
+ if (def) {
74
+ return { type: "inline-footnote-ref", id: obj.id, content: def.content };
75
+ }
76
+ return { type: "inline-text", content: `[^${obj.id}]` };
77
+ }
78
+ // recurse into known child-bearing fields
79
+ const out = Object.assign({}, obj);
80
+ for (const key of ["content", "items", "rows"]) {
81
+ if (Array.isArray(obj[key]))
82
+ out[key] = obj[key].map(walk);
83
+ }
84
+ if (obj.sublist)
85
+ out.sublist = walk(obj.sublist);
86
+ return out;
87
+ }
88
+ return ast
89
+ .filter((n) => !(isObj(n) &&
90
+ (n.type === "link-definition" ||
91
+ n.type === "footnote-definition")))
92
+ .map(walk);
93
+ }
94
+ /** True for any non-null value whose `typeof` is "object" (this includes arrays). */ function isObj(v) {
95
+ return typeof v === "object" && v !== null;
96
+ }
@@ -0,0 +1,125 @@
1
+ /** Union of every node shape that can appear in inline content. */ export type InlineMarkdown = InlineText | InlineSoftBreak | InlineBold | InlineItalic | InlineBoldItalic | InlineStrike | InlineHardBreak | InlineLink | InlineCode | Image | InlineRefLink | InlineRefImage | InlineFootnoteRef | InlineHTML;
2
+ /** Inline HTML tag or comment kept verbatim. */ export type InlineHTML = {
3
+ type: "inline-html";
4
+ /** Raw passthrough source including angle brackets. */
5
+ content: string;
6
+ };
7
+ /** A run of literal text (also used for decoded escapes and entities). */ export type InlineText = {
8
+ type: "inline-text";
9
+ content: string;
10
+ };
11
+ /** Bold span; `content` holds the nested inline nodes. */ export type InlineBold = {
12
+ type: "inline-bold";
13
+ content: InlineMarkdown[];
14
+ };
15
+ /** Italic span; `content` holds the nested inline nodes. */ export type InlineItalic = {
16
+ type: "inline-italic";
17
+ content: InlineMarkdown[];
18
+ };
19
+ /** Combined bold + italic span; `content` holds the nested inline nodes. */ export type InlineBoldItalic = {
20
+ type: "inline-bold-italic";
21
+ content: InlineMarkdown[];
22
+ };
23
+ /** Strikethrough span; `content` holds the nested inline nodes. */ export type InlineStrike = {
24
+ type: "inline-strike";
25
+ content: InlineMarkdown[];
26
+ };
27
+ /** Hard line break marker; carries no payload. */ export type InlineHardBreak = {
28
+ type: "inline-hard-break";
29
+ };
30
+ /** Soft line break marker; carries no payload. */ export type InlineSoftBreak = {
31
+ type: "inline-soft-break";
32
+ };
33
+ /** Link with nested inline content, a destination and an optional title. */ export type InlineLink = {
34
+ type: "inline-link";
35
+ content: InlineMarkdown[];
36
+ url: string;
37
+ title?: string;
38
+ };
39
+ /** Code span; `content` is the plain code text. */ export type InlineCode = {
40
+ type: "inline-code";
41
+ content: string;
42
+ };
43
+ /** Paragraph block made of inline nodes. */ export type Paragraph = {
44
+ type: "paragraph";
45
+ content: InlineMarkdown[];
46
+ };
47
+ /** Heading block with a numeric `level` and inline content. */ export type Heading = {
48
+ type: "heading";
49
+ level: number;
50
+ content: InlineMarkdown[];
51
+ };
52
+ /** Code block; `content` is the raw code and `language` may be `null`. */ export type CodeBlock = {
53
+ type: "code-block";
54
+ content: string;
55
+ language: string | null;
56
+ };
57
+ /** What a block quote may contain: inline nodes or a nested block quote. */ export type BlockQuoteContent = InlineMarkdown | BlockQuote;
58
+ export type BlockQuote = {
59
+ type: "block-quote";
60
+ content: BlockQuoteContent[];
61
+ };
62
+ /** Image with a destination, plain-string alt text and an optional title. */ export type Image = {
63
+ type: "image";
64
+ url: string;
65
+ alt: string;
66
+ title?: string;
67
+ };
68
+ /** Reference-style link placeholder; `resolveReferences` replaces it with an `InlineLink` or literal text. */ export type InlineRefLink = {
69
+ type: "inline-ref-link";
70
+ text: string;
71
+ id: string;
72
+ };
73
+ /** Reference-style image placeholder; `resolveReferences` replaces it with an `Image` or literal text. */ export type InlineRefImage = {
74
+ type: "inline-ref-image";
75
+ alt: string;
76
+ id: string;
77
+ };
78
+ /** One list entry: inline content plus an optional nested list. Note that it has no `type` tag. */ export type ListItem = {
79
+ content: InlineMarkdown[];
80
+ sublist?: List;
81
+ /** GFM task-list state: `true` for `[x]`/`[X]`, `false` for `[ ]`, absent for plain items. */
82
+ checked?: boolean;
83
+ };
84
+ /** Ordered or unordered list. `start` is presumably the first item's number — confirm against the list parser. */ export type List = {
85
+ type: "list";
86
+ ordered: boolean;
87
+ start: number;
88
+ items: ListItem[];
89
+ };
90
+ /** Horizontal rule marker; carries no payload. */ export type HorizontalRule = {
91
+ type: "horizontal-rule";
92
+ };
93
+ /** Column alignment of a table; `null` when none is specified. */ export type Alignment = "left" | "right" | "center" | null;
94
+ /** Table whose header and body cells are plain strings (not inline nodes). */ export type Table = {
95
+ type: "table";
96
+ headers: string[];
97
+ alignments: Alignment[];
98
+ rows: string[][];
99
+ };
100
+ /** Reference link definition; consumed and stripped by `resolveReferences` when it sits at the top level of the AST. */ export type LinkDef = {
101
+ type: "link-definition";
102
+ id: string;
103
+ url: string;
104
+ title?: string;
105
+ };
106
+ /** Footnote reference `[^id]`. */ export type InlineFootnoteRef = {
107
+ type: "inline-footnote-ref";
108
+ id: string;
109
+ /** Filled in by `resolveReferences` when a matching FootnoteDef exists. */
110
+ content?: string;
111
+ };
112
+ /** Footnote definition; consumed and stripped by `resolveReferences` when it sits at the top level of the AST. */ export type FootnoteDef = {
113
+ type: "footnote-definition";
114
+ id: string;
115
+ content: string;
116
+ };
117
+ /** Block-level HTML kept as a raw string. */ export type HTMLBlock = {
118
+ type: "html-block";
119
+ content: string;
120
+ };
121
+ /** A frontmatter value: a scalar, `null`, or an arbitrarily nested array of such values. */ export type FrontmatterValue = string | number | boolean | null | FrontmatterValue[];
122
+ /** Frontmatter block; `data` maps each key to its value. */ export type Frontmatter = {
123
+ type: "frontmatter";
124
+ data: Record<string, FrontmatterValue>;
125
+ };
@@ -0,0 +1,2 @@
1
+ /* AST types for the Markdown example parser. */
2
+ export {};
package/dist/parsers.js CHANGED
@@ -168,7 +168,23 @@ export const quotedString = trace("quotedString", (input) => {
168
168
  recordFailure(input, "a quoted string");
169
169
  return failure(`expected a quote, got ${escape(input[0])}`, input);
170
170
  }
171
- const closeIdx = input.indexOf(q, 1);
171
+ let closeIdx = -1;
172
+ let searchFrom = 1;
173
+ while (true) {
174
+ const idx = input.indexOf(q, searchFrom);
175
+ if (idx === -1)
176
+ break;
177
+ // Count consecutive backslashes before the quote
178
+ let backslashes = 0;
179
+ for (let i = idx - 1; i >= 1 && input[i] === "\\"; i--)
180
+ backslashes++;
181
+ if (backslashes % 2 === 0) {
182
+ // Even number of backslashes (including 0) means the quote is unescaped
183
+ closeIdx = idx;
184
+ break;
185
+ }
186
+ searchFrom = idx + 1;
187
+ }
172
188
  if (closeIdx === -1) {
173
189
  recordFailure(input, "a quoted string");
174
190
  return failure(`expected closing ${escape(q)}`, input);
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "tarsec",
3
- "version": "0.2.1",
3
+ "version": "0.3.0",
4
4
  "description": "A parser combinator library for TypeScript, inspired by Parsec.",
5
5
  "homepage": "https://github.com/egonSchiele/tarsec",
6
6
  "scripts": {
@@ -19,6 +19,11 @@
19
19
  ".": {
20
20
  "import": "./dist/index.js",
21
21
  "require": "./dist/index.js"
22
+ },
23
+ "./parsers/markdown": {
24
+ "types": "./dist/parsers/markdown/index.d.ts",
25
+ "import": "./dist/parsers/markdown/index.js",
26
+ "require": "./dist/parsers/markdown/index.js"
22
27
  }
23
28
  },
24
29
  "type": "module",
@@ -38,4 +43,4 @@
38
43
  "typescript": "^5.4.2",
39
44
  "vitest": "^1.4.0"
40
45
  }
41
- }
46
+ }